Load necessary packages.
import numpy as np
import pandas as pd
import scanpy as sc
import anndata
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import phenograph
from sklearn.neighbors import NearestNeighbors
import scipy
from metacells.core import Metacells
from sklearn.metrics import silhouette_score, adjusted_rand_score
from copy import deepcopy
import random
import os
import sys
sys.path.append("/Users/rs3380/Desktop/python_funs/")
import utils
findfont: Font family ['Raleway'] not found. Falling back to DejaVu Sans. findfont: Font family ['Lato'] not found. Falling back to DejaVu Sans.
# Output directory for this experiment's results.
outbase = '/Users/rs3380/Dropbox/lung_tumor/new_folders/output/exp2/'
# Qualitative color palette (100 colors) from the local utils helper.
cols = utils.get_color_pal(100)
# Sample sheet: one row per sample (SampleID, name, background, condition,
# replicate, experiment).
samples = pd.read_table('/Users/rs3380/Dropbox/lung_tumor/data/sampleinfo/samples_krasdt_2.txt',sep='\t')
# Display the sample table.
samples
| SampleID | name | background | condition | replicate | experiment | |
|---|---|---|---|---|---|---|
| 0 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 |
| 1 | 2_DT_1_CD45plus | 2_DT_1_CD45minus | CD45- | DT | 1 | KRAS_DT_2 |
| 2 | 2_DT_2_CD45minus | 2_DT_2_CD45minus | CD45- | DT | 2 | KRAS_DT_2 |
| 3 | 2_DT_2_CD45plus | 2_DT_2_CD45plus | CD45+ | DT | 2 | KRAS_DT_2 |
| 4 | 2_DT_3_CD45_minus | 2_DT_3_CD45minus | CD45- | DT | 3 | KRAS_DT_2 |
| 5 | 2_DT_3_CD45plus | 2_DT_3_CD45plus | CD45+ | DT | 3 | KRAS_DT_2 |
| 6 | 2_ctl_1_CD45minus | 2_ctl_1_CD45minus | CD45- | CTRL | 1 | KRAS_DT_2 |
| 7 | 2_ctl_1_CD45plus | 2_ctl_1_CD45plus | CD45+ | CTRL | 1 | KRAS_DT_2 |
| 8 | 2_ctl_2_CD45minus | 2_ctl_2_CD45minus | CD45- | CTRL | 2 | KRAS_DT_2 |
| 9 | 2_ctl_2_CD45plus | 2_ctl_2_CD45plus | CD45+ | CTRL | 2 | KRAS_DT_2 |
| 10 | 2_ctl_3_CD45plus | 2_ctl_3_CD45plus | CD45+ | CTRL | 3 | KRAS_DT_2 |
| 11 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 |
# load all datasets
# Read each sample's dense count matrix (cells x genes) into a dict keyed by
# the sample's `name`; drop the precomputed 'CLUSTER' column if present.
datadict = {}
for sid,sn in zip(samples['SampleID'],samples['name']):
    print('reading ' + sid)
    #print('/Users/rs3380/Dropbox/lung_tumor/data/SEQC/exp2/'+sid+'/'+sid+'_dense.csv')
    counts = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/data/SEQC/exp2/'
                         +sid+'/'+sid+'_dense.csv',index_col=0)
    # Counts are integer-valued; cast to int32 to save memory.
    counts = counts.astype(np.int32)
    datadict[sn] = counts.loc[:,~(counts.columns == 'CLUSTER')]
reading 2_DT_1_CD45_minus reading 2_DT_1_CD45plus reading 2_DT_2_CD45minus reading 2_DT_2_CD45plus reading 2_DT_3_CD45_minus reading 2_DT_3_CD45plus reading 2_ctl_1_CD45minus reading 2_ctl_1_CD45plus reading 2_ctl_2_CD45minus reading 2_ctl_2_CD45plus reading 2_ctl_3_CD45plus reading ctl_3_CD45_minus
# Report the (n_cells, n_genes) shape of every loaded sample.
for sample_name, sample_counts in datadict.items():
    print([sample_name, sample_counts.shape])
['2_DT_1_CD45plus', (3379, 15357)] ['2_DT_1_CD45minus', (4644, 18002)] ['2_DT_2_CD45minus', (2211, 16815)] ['2_DT_2_CD45plus', (2610, 15199)] ['2_DT_3_CD45minus', (1850, 16262)] ['2_DT_3_CD45plus', (3249, 15512)] ['2_ctl_1_CD45minus', (2679, 16622)] ['2_ctl_1_CD45plus', (4227, 15745)] ['2_ctl_2_CD45minus', (901, 13225)] ['2_ctl_2_CD45plus', (3400, 15554)] ['2_ctl_3_CD45plus', (2017, 15139)] ['2_ctl_3_CD45minus', (1695, 15810)]
# Per-sample library-size summary: shape, median total counts per cell,
# and median log10(counts + 1).
for sample_name, sample_counts in datadict.items():
    cell_totals = np.sum(sample_counts, axis=1)
    print([sample_name, sample_counts.shape,
           np.median(cell_totals),
           np.median(np.log10(cell_totals + 1))])
['2_DT_1_CD45plus', (3379, 15357), 2955.0, 3.470704429722788] ['2_DT_1_CD45minus', (4644, 18002), 3857.0, 3.586362208718719] ['2_DT_2_CD45minus', (2211, 16815), 1626.0, 3.2113875529368587] ['2_DT_2_CD45plus', (2610, 15199), 2898.0, 3.462248112003213] ['2_DT_3_CD45minus', (1850, 16262), 1052.5, 3.0226333171130175] ['2_DT_3_CD45plus', (3249, 15512), 4383.0, 3.641870545476313] ['2_ctl_1_CD45minus', (2679, 16622), 2605.0, 3.415974411376566] ['2_ctl_1_CD45plus', (4227, 15745), 2908.0, 3.463743721247059] ['2_ctl_2_CD45minus', (901, 13225), 155.0, 2.1931245983544616] ['2_ctl_2_CD45plus', (3400, 15554), 3067.5, 3.486926121854383] ['2_ctl_3_CD45plus', (2017, 15139), 3306.0, 3.519434194913703] ['2_ctl_3_CD45minus', (1695, 15810), 2096.0, 3.3215984304653436]
## combine before filtering -- going to choose metric based on combined dataframe
# concatenate rows of cells into single dataframe
fullcounts_df = pd.concat(datadict.values(),axis=0)
# replace NAs with 0 in concat dataframe
# (genes absent from a given sample become NaN after the outer concat)
fullcounts_df.fillna(0,inplace=True)
fullcounts_df.shape
(32862, 20102)
# Remove genes detected in 10 or fewer cells (keep genes with > 10 cells):
gene_counts = (fullcounts_df > 0).sum()  # number of cells expressing each gene
counts_df = fullcounts_df.loc[:,gene_counts > 10]
counts_df.shape
(32862, 16464)
# Library size (total counts) per cell; median-library-size normalization.
libsizes = counts_df.sum(axis=1)
normcount_df = counts_df.div(libsizes, axis=0).mul(np.median(libsizes), axis=0)
# Library-size histogram with the 500-count filtering threshold marked in red.
plt.figure(figsize = (8, 6))
plt.hist(np.log2(libsizes + 1), 100);
plt.axvline(np.log2(500 + 1), color = 'r')
plt.xlabel('log2 library size')
plt.ylabel('Frequency')
Text(0, 0.5, 'Frequency')
# Log-transform; 0.1 pseudocount avoids log2(0).
normlog_df = np.log2(normcount_df + 0.1)
# Reassign cell indices to avoid repetition across samples
sampleinfo_df = samples.copy()
sampleinfo_df['ncells'] = [datadict[sn].shape[0] for sn in sampleinfo_df['name']]
cell_index = fullcounts_df.index
# 3 digit sample number added to the end
# NOTE(review): assumes datadict preserves the row order of `samples` (true
# here since datadict was built by iterating `samples`) -- confirm if either
# is ever reordered.
idx_end = sum([['{:03d}'.format(i)]*n for i,n in enumerate(sampleinfo_df['ncells'])],[])
new_index = pd.Index([int(str(ci)+e) for ci,e in zip(list(cell_index),idx_end)],dtype='int64')
# Propagate the sample-tagged index to all per-cell tables.
libsizes.index = new_index
counts_df.index = new_index
normcount_df.index = new_index
normlog_df.index = new_index
# Per-cell sample metadata: repeat each sample's row ncells times.
nrepeats = sum([[i]*n for i,n in enumerate(sampleinfo_df['ncells'])],[])
cell_info = sampleinfo_df.loc[nrepeats,:].copy()
cell_info.index = new_index
cell_info['libsize'] = libsizes
# Keep cells with library size > 500; show the fraction retained of 32862 cells.
LIBSELECT = cell_info['libsize'] > 500
sum(LIBSELECT)/32862
0.8400584261457003
# Apply the library-size filter to metadata and counts.
filtered_info = cell_info.loc[LIBSELECT,:]
filtered_count = counts_df.loc[LIBSELECT,:]
# Recompute gene detection counts on the filtered cells.
gene_counts = (filtered_count > 0).sum()
## retain genes detected in more than 10 of the filtered cells
filtered_count = filtered_count.loc[:,gene_counts > 10]
filtered_count.shape
(27606, 16435)
# Re-normalize on the filtered matrix (median library-size scaling) and
# log-transform with the same 0.1 pseudocount.
filtered_libsizes = filtered_count.sum(axis=1)
filtered_normcount = filtered_count.div(filtered_libsizes, axis=0).mul(np.median(filtered_libsizes), axis=0)
filtered_normlog = np.log2(filtered_normcount + 0.1)
filtered_normlog.shape
(27606, 16435)
# Sanity check: no NaNs remain after normalization (expect 0).
np.sum(np.sum(np.isnan(filtered_normlog)))
0
# Assemble an AnnData object: raw counts in X, normalized matrices as layers.
adata_clean_combined = anndata.AnnData(X = filtered_count.values,
                                       obs = pd.DataFrame(index = [str(j) for j in filtered_count.index]),
                                       var = pd.DataFrame(index = filtered_count.columns))
adata_clean_combined.layers['norm_count'] = filtered_normcount.values
adata_clean_combined.layers['norm_log'] = filtered_normlog.values
from sklearn.decomposition import PCA
# Randomized PCA with 1000 components on log-normalized data; only the first
# 50 PCs are stored on the AnnData object.
pca_filtered = PCA(n_components=1000, svd_solver='randomized')
filtered_pcaproj = pd.DataFrame(pca_filtered.fit_transform(filtered_normlog),
                                index=filtered_normlog.index)
adata_clean_combined.obsm['X_pca'] = filtered_pcaproj.iloc[:, 0:50].values
# NOTE(review): the freshly computed projection is immediately replaced by a
# previously saved one -- presumably for reproducibility of downstream
# clustering; confirm the saved file came from the same preprocessing.
filtered_pcaproj = pd.read_csv(outbase + 'filtered_pca_proj_combined.csv', index_col = 0)
filtered_pcaproj.shape
(27606, 1000)
# Load a previously computed tSNE embedding and attach it to the AnnData object.
tsne_proj_combined = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/results/cleanup_exp2_050419/filtered_combined_tsne.csv',
                                 index_col = 0)
adata_clean_combined.obsm['X_tsne'] = tsne_proj_combined.values
# PhenoGraph clustering on the first 50 PCs over a range of neighborhood
# sizes k, to assess how stable the partition is to the choice of k.
communities = {}
for k in [20, 25, 30, 35, 40, 45]:
    print(k)
    communities[str(k)], _, _ = phenograph.cluster(filtered_pcaproj.iloc[:,0:50],k=k)
20 Finding 20 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 5.146052122116089 seconds Jaccard graph constructed in 5.809924125671387 seconds Wrote graph to binary file in 0.42493414878845215 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.9312 Louvain completed 21 runs in 11.947348833084106 seconds Sorting communities by size, please wait ... PhenoGraph completed in 24.57544708251953 seconds 25 Finding 25 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 4.076868057250977 seconds Jaccard graph constructed in 7.407986164093018 seconds Wrote graph to binary file in 0.6505348682403564 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.928252 After 3 runs, maximum modularity is Q = 0.929731 Louvain completed 23 runs in 16.358815908432007 seconds Sorting communities by size, please wait ... PhenoGraph completed in 30.14313006401062 seconds 30 Finding 30 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 5.161328077316284 seconds Jaccard graph constructed in 9.662767887115479 seconds Wrote graph to binary file in 0.7901201248168945 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.926594 After 8 runs, maximum modularity is Q = 0.92787 Louvain completed 28 runs in 29.430394887924194 seconds Sorting communities by size, please wait ... PhenoGraph completed in 46.66907000541687 seconds 35 Finding 35 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 5.677194833755493 seconds Jaccard graph constructed in 14.628026962280273 seconds Wrote graph to binary file in 1.4968199729919434 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.926007 Louvain completed 21 runs in 24.551576852798462 seconds Sorting communities by size, please wait ... 
PhenoGraph completed in 48.442304849624634 seconds 40 Finding 40 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 6.133985996246338 seconds Jaccard graph constructed in 19.400776386260986 seconds Wrote graph to binary file in 1.4717350006103516 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.92509 Louvain completed 21 runs in 29.556668758392334 seconds Sorting communities by size, please wait ... PhenoGraph completed in 58.82509398460388 seconds 45 Finding 45 nearest neighbors using minkowski metric and 'auto' algorithm Neighbors computed in 6.938416957855225 seconds Jaccard graph constructed in 15.81141185760498 seconds Wrote graph to binary file in 1.3379530906677246 seconds Running Louvain modularity optimization After 1 runs, maximum modularity is Q = 0.924237 Louvain completed 21 runs in 28.462908029556274 seconds Sorting communities by size, please wait ... PhenoGraph completed in 54.223273038864136 seconds
# Pairwise Adjusted Rand Index between the PhenoGraph partitions obtained at
# different k, shown as a heatmap (values near 1 = partitions agree).
# Fixes: `adjusted_rand_score` was used without being imported (now imported
# at the top of the file), and the matrix size is derived from `communities`
# instead of being hard-coded to 6.
n_runs = len(communities)
res_mat = np.zeros(shape = (n_runs, n_runs))
for j, item in enumerate(communities.keys()):
    for k, item2 in enumerate(communities.keys()):
        res_mat[j, k] = adjusted_rand_score(communities[item], communities[item2])
plt.figure(figsize= (8, 6))
plt.imshow(res_mat, vmin = 0, vmax = 1, cmap = 'magma_r')
plt.colorbar()
plt.xticks(range(n_runs), communities.keys());
plt.yticks(range(n_runs), communities.keys());
#plt.grid()#which='minor', color='w', linestyle='-', linewidth=5)
plt.xlabel('k')
plt.ylabel('k')
plt.title('Adjusted Rand Index')
Text(0.5, 1.0, 'Adjusted Rand Index')
# Load the previously saved PhenoGraph clustering (k=30, 50 PCs; no header row)
# and preview the labels aligned to the AnnData cell order.
filtered_cluster_old = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/results/coarse_assn_exp2_050419/filtered_cluster_k30_50PCs.csv', index_col = 0, header = None)
filtered_cluster_old.loc[[int(j) for j in adata_clean_combined.obs.index]].iloc[:, 0].values
array([ 1, 4, 1, ..., 5, 5, 10])
# Store the aligned cluster labels on the AnnData object.
adata_clean_combined.obs['Clusters'] = filtered_cluster_old.loc[[int(j) for j in adata_clean_combined.obs.index]].iloc[:, 0].values
from sklearn.decomposition import PCA
from MulticoreTSNE import MulticoreTSNE as TSNE
# tSNE colored by the saved cluster labels (one color per cluster id).
fig = plt.figure(figsize = (8*2, 6*1))
ax = fig.add_subplot(1, 2, 1)
old_cluster = filtered_cluster_old.iloc[:, 0].values
for j in np.unique(old_cluster):
    ax.scatter(tsne_proj_combined['x'][old_cluster == j], tsne_proj_combined['y'][old_cluster == j],
               s = 1, c = cols[j], label = str(j))
ax.legend(markerscale = 10, fontsize = 10, ncol = 2, bbox_to_anchor = [0, 0, 1.2, 1])
ax.axis('off');
#ax.set_title('Old clusters')
# Marker gene panel for coarse cell-type annotation (immune, epithelial,
# endothelial, stromal lineages); np.unique drops the duplicated entries.
genes = np.unique(['PTPRC', 'CD3E', 'SFTPB','AGER','CLIC5','EPCAM','GP6','CFTR','MCPT8','CD200R3','ADGRE1','CD3G',
                   'MT-ATP6','HSPA1A','SPI1', 'LY6C1','FOXJ1','SIGLECH','CD68','FLT3', 'MS4A1','ITGAM','ITGAX','CD14',
                   'PDPN','PECAM1','SFTPB', 'PTPRC','CD3G','TRAC','NCR1', 'CSF3R','CSF1R','MERTK','ADGRE1','CCR2', 'SIRPA','ITGAX','XCR1',
                   'CLEC9A','PDPN','PECAM1','LYVE1','SFTPB','MT-ATP6'])
# Drop the mitochondrial / heat-shock markers from the display panel.
genes = list(set(genes).difference(set(['MT-ATP6','HSPA1A'])))
len(genes)
34
# tSNE panels of raw (non-imputed) log-normalized expression for each marker,
# color scale clipped to the 1st--99th percentile for display.
fig = plt.figure(figsize = (8*6, 6*6))
for j, item in enumerate(genes):
    ax = fig.add_subplot(6, 6, j + 1)
    c = filtered_normlog[item]
    im1 = ax.scatter(tsne_proj_combined['x'], tsne_proj_combined['y'],
                     c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99), s = 0.5,
                     cmap = 'viridis')
    ax.axis('off')
    ax.set_title(item)
# Single colorbar, scaled to the last panel only.
fig.colorbar(im1)
# Cluster-averaged marker expression heatmap (columns standardized 0-1).
sub_data = filtered_normlog[genes].copy()
sub_data['Cluster'] = list(filtered_cluster_old.iloc[:, 0].values)
sub_data_avg = sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(sub_data_avg, standard_scale = 1,col_cluster = True,
                   figsize = (12, 12), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
#for ct, label in enumerate([j for j in NEUMARKERS_dict.keys()]):
#    g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
#                            label=label, linewidth=0)
#g.ax_col_dendrogram.legend(ncol=1, bbox_to_anchor = (0, 0.8))
# Diffusion maps on the 50-PC space (knn=30); res['T'] (used below for
# imputation) is the cell-cell transition matrix.
res = utils.run_diffusion_maps(filtered_pcaproj.iloc[:,0:50], knn = 30)
Determing nearest neighbor graph using euclidean distance metric
def impute_expression(T,expression,TSTEPS=4):
    """Smooth expression by diffusion on a cell-cell transition matrix.

    Applies ``T`` to ``expression`` exactly ``TSTEPS`` times, i.e. returns
    ``T**TSTEPS @ expression`` wrapped back into a DataFrame with the
    original index and columns.

    Parameters
    ----------
    T : array-like or sparse matrix, cells x cells
        Transition matrix (e.g. ``res['T']`` from diffusion maps); row order
        must match ``expression``.
    expression : pd.DataFrame, cells x genes
        Expression values to impute.
    TSTEPS : int, default 4
        Number of diffusion steps; must be >= 1.

    Returns
    -------
    pd.DataFrame
        Imputed expression, same index/columns as ``expression``.

    Raises
    ------
    ValueError
        If ``TSTEPS`` is less than 1 (the original silently performed one
        step in that case).
    """
    if TSTEPS < 1:
        raise ValueError('TSTEPS must be >= 1')
    T_step = T
    # impute values first, then keep stepping to speed it up:
    # multiplying the dense result repeatedly avoids forming T**k explicitly
    # and keeps T sparse-friendly.
    T_imputed = T_step.dot(expression.values)
    for i in range(1,TSTEPS):
        print('step {}'.format(i+1))
        T_imputed = T_step.dot(T_imputed)
    expression_imputed = pd.DataFrame(T_imputed,index=expression.index, columns=expression.columns)
    return expression_imputed
# Impute the log-normalized expression with 4 diffusion steps.
filtered_imputed = impute_expression(res['T'], filtered_normlog, TSTEPS=4)
step 2 step 3 step 4
# Same marker panels as above, but on the diffusion-imputed expression.
fig = plt.figure(figsize = (8*6, 6*6))
for j, item in enumerate(genes):
    ax = fig.add_subplot(6, 6, j + 1)
    c = filtered_imputed[item]
    im1 = ax.scatter(tsne_proj_combined['x'], tsne_proj_combined['y'],
                     c = c, vmin = np.percentile(c, 1), vmax = np.percentile(c, 99), s = 0.5,
                     cmap = 'viridis')
    ax.axis('off')
    ax.set_title(item)
# Single colorbar, scaled to the last panel only.
fig.colorbar(im1)
# Cluster-averaged imputed marker expression heatmap (columns standardized 0-1).
sub_data = filtered_imputed[genes].copy()
sub_data['Cluster'] = list(filtered_cluster_old.iloc[:, 0].values)
sub_data_avg = sub_data.groupby('Cluster').mean()
import matplotlib as mpl
mpl.style.use('default')
g = sns.clustermap(sub_data_avg, standard_scale = 1,col_cluster = True,
                   figsize = (12, 12), linewidth = 0.2, cbar_pos=(1, .3, .01, .4))
# Draw the legend bar for the classes
#for ct, label in enumerate([j for j in NEUMARKERS_dict.keys()]):
#    g.ax_col_dendrogram.bar(0, 0, color=cols_use[ct],
#                            label=label, linewidth=0)
#g.ax_col_dendrogram.legend(ncol=1, bbox_to_anchor = (0, 0.8))
# coarse assignment
# Cell-type label -> list of PhenoGraph cluster ids (k=30 solution), assigned
# manually from the marker heatmaps / tSNE panels above.
assn_dict = {'T/NK cell':[0,8,9,14,17,24,33],
             'B cell':[1],
             'Mono/Mac/DC':[3,4,12,16,18,30],
             'Neutrophil':[6,7],
             'Fibroblast':[2,10,13,22,23,26,28],
             'Lymphatic Endothelial':[15],
             'Blood Endothelial':[5,11,25,29],
             'AT1/AT2':[19,21,34],
             'Ciliated':[32],
             'Platelet':[27],
             'Basophil':[31],
             'Low Quality':[20]}
# individual cell assignments
# Invert the mapping: cluster id -> cell-type label.
assn_clustmap = {}
for a,clusts in assn_dict.items():
    for c in clusts:
        assn_clustmap[c] = a
# Map every cell's cluster to its label and store on the AnnData object,
# aligned by cell index.
filtered_assn = filtered_cluster_old.iloc[:, 0].map(assn_clustmap)
adata_clean_combined.obs['Celltype'] = filtered_assn.loc[[int(j) for j in adata_clean_combined.obs.index]].values
from matplotlib.cm import get_cmap
# tab20 qualitative palette for the cell-type colors; inspect the first entry.
cols_use = get_cmap('tab20').colors
cols_use[0]
(0.12156862745098039, 0.4666666666666667, 0.7058823529411765)
# tSNE colored by coarse cell-type (using the loaded tSNE DataFrame).
fig = plt.figure(figsize = (8*1, 8*1))
ax = fig.add_subplot(1, 1, 1)
cell_type = filtered_assn.values
for j, item in enumerate(np.unique(cell_type)):
    ax.scatter(tsne_proj_combined['x'][cell_type == item], tsne_proj_combined['y'][cell_type == item],
               s = 1, color = cols_use[j], label = item)
ax.legend(markerscale = 7, fontsize = 14, ncol = 1, bbox_to_anchor=(0, 0, 1.5, 1.08), loc="right")
ax.axis('off');
#ax.set_title('Old clusters')
# Same plot via the AnnData copies (obs['Celltype'], obsm['X_tsne']) as a
# cross-check that the stored annotations match.
fig = plt.figure(figsize = (8*1, 8*1))
ax = fig.add_subplot(1, 1, 1)
cell_type = adata_clean_combined.obs['Celltype']
for j, item in enumerate(np.unique(cell_type)):
    ax.scatter(adata_clean_combined.obsm['X_tsne'][cell_type == item, 0],
               adata_clean_combined.obsm['X_tsne'][cell_type == item, 1],
               s = 1, color = cols_use[j], label = item)
ax.legend(markerscale = 7, fontsize = 14, ncol = 1, bbox_to_anchor=(0, 0, 1.5, 1.08), loc="right")
ax.axis('off');
#ax.set_title('Old clusters')
# Inspect the filtered per-cell metadata table.
filtered_info
| SampleID | name | background | condition | replicate | experiment | ncells | libsize | |
|---|---|---|---|---|---|---|---|---|
| 120703423765940000 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 | 3379 | 2831.0 |
| 120703424032052000 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 | 3379 | 9806.0 |
| 120703424055725000 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 | 3379 | 2198.0 |
| 120703436053422000 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 | 3379 | 8404.0 |
| 120703436147574000 | 2_DT_1_CD45_minus | 2_DT_1_CD45plus | CD45+ | DT | 1 | KRAS_DT_2 | 3379 | 11526.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 241106421208805011 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 | 1695 | 2386.0 |
| 241114562291571011 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 | 1695 | 2862.0 |
| 241114589027246011 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 | 1695 | 3799.0 |
| 241114589063987011 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 | 1695 | 2668.0 |
| 241114608720683011 | ctl_3_CD45_minus | 2_ctl_3_CD45minus | CD45- | CTRL | 3 | KRAS_DT_2 | 1695 | 7576.0 |
27606 rows × 8 columns
# Copy the per-cell metadata columns onto AnnData obs, aligned by cell index.
for item in filtered_info.columns:
    adata_clean_combined.obs[item] = filtered_info.loc[[int(j) for j in adata_clean_combined.obs.index]][item].values
# Load the precomputed per-cell confounders (QC fractions, scrublet doublet
# calls, etc.) and inspect the available columns.
confounders = pd.read_csv('/Users/rs3380/Dropbox/lung_tumor/results/cleanup_exp2_050419/confounders_050419.csv',index_col=0)
confounders.columns
Index(['SampleID', 'name', 'background', 'condition', 'replicate',
'experiment', 'ncells', 'libsize', 'mito_fraction', 'ribo_fraction',
'mhc_fraction', 'actin_fraction', 'cytoskeleton_fraction',
'malat1_fraction', 'scrublet_predict', 'scrublet_score', 'lowlibsize'],
dtype='object')
# Current obs columns, before merging in the confounders.
adata_clean_combined.obs.columns
Index(['Clusters', 'Celltype', 'SampleID', 'name', 'background', 'condition',
'replicate', 'experiment', 'ncells', 'libsize'],
dtype='object')
# Attach every confounder column to obs, aligned by cell index.
# NOTE(review): columns shared with filtered_info (SampleID, libsize, ...)
# are overwritten with the saved values -- presumably identical; confirm.
for item in confounders.columns:
    adata_clean_combined.obs[item] = confounders.loc[[int(j) for j in adata_clean_combined.obs.index]][item].values
# Pre-filter shape (all 32862 cells) -- the AnnData object holds 27606.
counts_df.shape
(32862, 16464)
# Predicted doublets (scrublet) highlighted on both tSNE representations:
# left from the loaded tables, right from the AnnData copies.
fig = plt.figure(figsize = (8*2, 8*1))
ax = fig.add_subplot(1, 2, 1)
doublet_predict = confounders['scrublet_predict']
# NOTE(review): boolean-Series masking here relies on `confounders` sharing
# tsne_proj_combined's index -- confirm both CSVs use the same cell ids/order.
ax.scatter(tsne_proj_combined['x'][doublet_predict], tsne_proj_combined['y'][doublet_predict],
           s = 10, color = 'r', label = 'Doublet')
ax.scatter(tsne_proj_combined['x'][~doublet_predict], tsne_proj_combined['y'][~doublet_predict],
           s = 1, color = 'grey', label = 'Singlet')
ax.legend(markerscale = 1, fontsize = 14, ncol = 1, bbox_to_anchor=(0, 0, 1.2, 1.08), loc="right")
ax.axis('off');
#ax.set_title('Old clusters')
ax = fig.add_subplot(1, 2, 2)
doublet_predict = adata_clean_combined.obs['scrublet_predict']
ax.scatter(adata_clean_combined.obsm['X_tsne'][doublet_predict, 0],
           adata_clean_combined.obsm['X_tsne'][doublet_predict, 1],
           s = 10, color = 'r', label = 'Doublet')
ax.scatter(adata_clean_combined.obsm['X_tsne'][~doublet_predict, 0],
           adata_clean_combined.obsm['X_tsne'][~doublet_predict, 1],
           s = 1, color = 'grey', label = 'Singlet')
ax.legend(markerscale = 1, fontsize = 14, ncol = 1, bbox_to_anchor=(0, 0, 1.2, 1.08), loc="right")
ax.axis('off');
#ax.set_title('Old clusters')
# Persist the assembled AnnData object for part 2 of the analysis.
adata_clean_combined.write_h5ad(outbase + 'clean_combined.h5ad')
... storing 'Celltype' as categorical ... storing 'SampleID' as categorical ... storing 'name' as categorical ... storing 'background' as categorical ... storing 'condition' as categorical ... storing 'experiment' as categorical
Please see part 2 of this file for the rest of the analysis.